* README: This is just like 2_data_process but saves ALL tax rates
/************************************************
* Data Processing 2
* This file starts from the same data as before, but has the intention to "Clean it" 
* and make it ready for merging across ASEC and CPS montly
* Alot of the Clean up would be "copy paste"  from 1_data_process.do
************************************************/
*use "C:\Users\user\Dropbox2\Dropbox (Personal)\projects\00 New Projects\TrumpTax\CPS2015-2017.dta", clear


use ".\data\CPS2015-2017.dta", clear

gen sort_id=_n
** Most variables are coded with 99999.... when not available. For our purposes they need to be
** zeroed

foreach i of varlist inc* {
	qui:sum `i'
	display "`i': max value: " r(max)
	replace `i'=0 if `i'==r(max)
}

** create household level income
foreach i of varlist inc* {
	bysort year month serial:egen h`i'=sum(`i')
	local lbl:variable label `i'
	label var h`i' "Household `lbl'"
}
** Flag households when The sum of total income is different from data Household income
** This is to avoid problems with measurment errors
** This are 4012 records 

gen byte flag=0
label define flag 0 "no errors", modify
label values flag flag 
replace flag=1 if hinctot!=hhincome & hhincome!=.

label define flag 1 "Hhincome != sum(inctot)", modify
** we will also be Flaggin households with negative or zero income
** Additional are 3729 records
replace flag=2 if hinctot<=0 & hhincome!=. & flag==0
label define flag 2 "Negative total income", modify
** The next step is Flagging Couple and Single Households
** This will be defined by whoever declares to be the Householder 
** This ignores, for example, if the grandfather (for being the oldest) is a widow
** who is living with his child and child's spouse. From a tax perspective, 
** The Child and Childs Spouse may be the one filling taxes.

** Trick: Ill recode unmarried parter as Spouse. 
** This may help later. but will keep the code for filling sep


	by year month serial :egen fill_sep=sum(relate==1114)
	replace relate=201 if relate==1114

	by year month serial :egen marriedhh=sum(relate<=201)
	label var    marriedhh "Married Household"
	label define marriedhh  0 "error" 1 "Single" 2 "Couple"
	label values marriedhh  marriedhh 
	replace  flag=3 if marriedhh==0 & flag==0
	label define flag 3 "Households without a head", modify
	
	gen byte flag_couple=0 if marriedhh==1
	replace  flag_couple=1 if marriedhh==2

	by year month serial :egen nohead=sum(relate==101)
	
** At this point i ll create a Quintile by household
	by year month serial: gen n_flag=_n
	egen q_income=xtile(hhincome) if n_flag==1, weight(asecwth) by(year) n(5)
	drop n_flag		
	by year month serial:replace q_income=q_income[_n-1] if _n>1
	
** flag households with Other members
** What to do with "non_nuclear households"? We exclude them because we concentrate on nuclear households
** The model , and taxes, assumes only 1 or 2 tax payers. 
** The model also assumes there is only 1 family in the household. 
** For this simply FLAG using famunit, and keep Fam 1
	replace	 flag=4 if famunit>1 & flag==0
	label define flag 4 "Secondary or tertiary family", modify


** We also need to constrain the Nuclear households. Thus we need to drop households with other members.
	by  year month serial :egen flag_nnuclear=sum(relate>301)
	replace flag=5 if flag_nnuclear>0 & flag==0
	label define flag 5 "Non nuclear Households", modify

** There are 98,287 records with this. So we need to stablish that we are working with Nuclear households
** From here on we create some demographics
** All households are parents and Children
 
** Children per household
	by  year month serial:egen nkids05  =sum((age>=0 & age<=5)*(relate==301))
	by  year month serial:egen nkids613 =sum((age>=6 & age<=13)*(relate==301))
** nkids 14-17 required for taxes purposes
	by  year month serial:egen nkids1417=sum((age>=14 & age<=17)*(relate==301))
	by  year month serial:egen nkids1418=sum((age>=14 & age<=18)*(relate==301))
	by year month serial:egen nkids19p =sum((age>=19 & age<=90)*(relate==301))
	label var nkids05   "Number of kids 0-5"
	label var nkids613  "Number of kids 6-12"
	label var nkids1418 "Number of kids 13-18"
	label var nkids19p  "Number of kids older than 19"
** This last one is to obtain an additional level of data constrain. 
** Households where children are "old" 

	replace flag=6 if nkids19p>0 & flag==0
	label define flag 6 "HH with older children", modify
** Perhaps will have drop households with older children.

** getting some variables for husband and wife (spouse/partner)
** This has become trickier with the presence of same sex households. 
** so we do a different approach:

	gen byte head_spouse=(relate<250)
	** Create a variable to FLAG Head and Spouse by sex ONCE!
	gsort year month serial -head_spouse sex -age  sort_id
	gen long id_sort=_n 
	by year month serial:gen pid_sort=_n
** Id same sex couples: For now I ll keep Same sex couples
	by    year month serial :gen same_sex=(sex[1]==sex[2])
	replace same_sex=. if   flag_couple==0
	
** From here forward, I create data for Husbands and wifes. Husband is 1, wife is 2.	
** Age for husband and wife, plus other characteristics
	by    year month serial:gen age_1=age[1]  
	by    year month serial:gen age_2=age[2] 
	replace age_2=0 if flag_couple==0

	label var age_1 "Husband's age"
	label var age_2 "Spouse's  age"
** Recoding education
	recode educ (0/71=1 "Less than HS") (72 73=2 "HighSchool") (80/110=3 "Some College") (111/122=4 "College") (123/125=5 "Grad" ), gen(educl)
	label var educl "Education level by group"

	by    year month serial:gen educ_1=educl[1]
	by    year month serial:gen educ_2=educl[2]
	replace educ_2=0 if flag_couple==0

	label var educ_1 "Husband's level by group"
	label var educ_2 "Spouse's  level by group"
	label values educ_1 educ_2 educl
** Race 
	
	gen byte race_g=1 if int(race/100)==1
	replace  race_g=2 if int(race/100)==2
	replace  race_g=3 if hispan!=0
	replace  race_g=4 if race_g==.
	label define race_g 1 White 2 Black 3 Hispanic 4 Other
	label values race_g race_g
	label var    race_g "Group Race variable "
	by year month serial:gen race_1=race_g[1] 
	by year month serial:gen race_2=race_g[2]
	label var race_1 "Husband's Race"
	label var race_2 "Spouse's Race"
	replace race_2=0 if flag_couple==0
	label values race_1 race_2 race_g
	** Aux variables * Do we need this?
	*gen agesq_1=age_1^2
	*gen agesq_2=age_2^2
	** Region and division
	ren region division
	gen region=int(division/10)
	label define region2 1 "NorthEast" 2 "MidWest" 3 "South" 4 "West"
	label values region region2
	

** Drop individuals with ANY selfemp income NOT ANYMORE Here we make the changes? Later
*bysort year month serial:egen incbus_1=max(incbus*(sex==1)*(relate<250))
*bysort year month serial:egen incbus_2=max(incbus*(sex==2)*(relate<250))
*bysort year month serial:egen class_1=max(classwkr*(sex==1)*(relate<250))
*bysort year month serial:egen class_2=max(classwkr*(sex==2)*(relate<250))

////////////////////////////////////////////////////////////////////////
// Here we merge the data with information from CEX and SCF. using frames

capture:frame create cex
capture:frame create scf

*capture      cd "C:\Users\Fernando\Dropbox (Personal)\projects\00 New Projects\TrumpTax\"
frame scf:use ".\data\Trump_SCF_final data_new.dta", clear
frame cex:use ".\data\data_TaxSim_CEX_Division_09_22_20.dta", clear


frame cex:drop median*
frame scf:sum
frame cex:sum

ren married marreid_plus
gen married=1     if marreid_plus==2
replace married=0 if marreid_plus==1
ren division division_plus
egen division=group(division_plus)
ren q_income quintile

frlink m:1  married quintile , frame(scf)
frlink m:1  year division married quintile , frame(cex)

frget capital, from(scf)
frget mean*, from(cex)

foreach i of varlist mean* {
	replace `i'=0 if `i'==.
}

	*** Total Income
	by year month serial:gen inctot_1=inctot[1]  
	by year month serial:gen inctot_2=inctot[2] 
	replace inctot_2=0 if flag_couple==0
	
	by year month serial:gen incwage_1=incwage[1]  
	by year month serial:gen incwage_2=incwage[2] 
	
	by year month serial:gen incbus_1=incbus[1]  
	by year month serial:gen incbus_2=incbus[2] 
	
	by year month serial:gen incfarm_1=incfarm[1]  
	by year month serial:gen incfarm_2=incfarm[2] 
	
	replace incwage_2=0 if flag_couple==0
	replace incbus_2 =0 if flag_couple==0
	replace incfarm_2=0 if flag_couple==0
	
	by year month serial:egen other_total_income=sum(inctot*(relate>250))

	gen nly=  inctot_1 +inctot_2   ///
	         -incwage_1-incbus_1-incfarm_1 ///
			 -incwage_2-incbus_2-incfarm_2 ///
			 +other_total_income

	foreach i of varlist ?hrs* {
		replace `i'=. if `i'==999
		replace `i'=. if `i'==997
		replace `i'=. if `i'==0
	}
	
	by year month serial:egen ahincearn_c=sum(inctot*(relate>250))
		replace flag=8 if ahincearn_c>50000
	replace flag=8 if ahincearn_c<0
	label define flag 8 "Household with High income children", modify	
	replace flag=9 if hincrent<0
	label define flag 9 "HH with negative Rent income", modify	

	** Hour wage
	replace earnweek=. if earnweek==9999.99
	replace hourwage=. if hourwage==99.99
	replace earnweek=. if earnweek==0
	** People with 
	 
	** any earnings
	gen flag_earnings=0
	replace flag_earnings=1 if earnweek!=. & earnweek!=0
	replace flag_earnings=1 if incwage!=. & incwage!=0
	replace flag_earnings=1 if incbus!=. & incbus!=0
	replace flag_earnings=1 if incfarm!=. & incfarm!=0
	** This imputes HOURS for As many people as possible 
	gen     hours_work=.
	replace hours_work=.
	replace hours_work=uhrswork1
	replace hours_work=uhrsworkt if uhrswork2!=.
	replace hours_work=ahrsworkt if hours_work==.  
 	replace hours_work=uhrsworkorg if hours_work==. 
	replace hours_work=uhrsworkly  if hours_work==. 
	replace hours_work=40 if  (hours_work==. & flag_earnings==1)
	** Cap hours to 112. This implies works 16hrs a day. or 112 hrs per week
	** assumes an allocation of 8hrs for sleep and others
	replace hours_work=112 if hours_work!=. & hours_work>112
	
	** Perhaps Constrain so Only those at work or with a JOB have hours of work.
	** As expected, non in the labor force do not work. But some other statuses Work.
	/*
	gen hrs_work_pos=(hours_work!=0 & hours_work!=.)
	tab empstat hrs_work_pos

						  |     hrs_work_pos
		employment status |         0          1 |     Total
	----------------------+----------------------+----------
					  niu |   130,268          0 |   130,268 
			 armed forces |     1,178      1,041 |     2,219 
				  at work |         0    314,622 |   314,622 
	has job, not at work  |       387     10,756 |    11,143 
	unemployed, experienc |     8,824      5,247 |    14,071 
	unemployed, new worke |     1,463         39 |     1,502 
	 nilf, unable to work |    32,016        997 |    33,013 
			  nilf, other |    72,843      6,752 |    79,595 
			nilf, retired |   102,262      3,567 |   105,829 
	----------------------+----------------------+----------
					Total |   349,241    343,021 |   692,262 

	*/
 
	** This will  "Impute wages or earnings per hour for all.
	gen     hour_wage=.
	replace hour_wage=.
	replace hour_wage=hourwage
	replace hour_wage=earnweek/hours_work if hour_wage==.
	replace hour_wage=. if hour_wage==0
	*** Above applies mostly for CPS-MORG data
	
	by year month serial:gen hours_work_1=hours_work[1]  
	by year month serial:gen hours_work_2=hours_work[2] 
	by year month serial:gen hour_wage_1=hour_wage[1]  
	by year month serial:gen hour_wage_2=hour_wage[2] 
	replace hours_work_2=. if flag_couple==0
	replace hour_wage_2 =. if flag_couple==0
	
 
	*** here we will now get the data we need from CPS march to ASEC CPS
	*** we are using "frames" Stata 16 or above.
 	
	capture:frame drop new
	frame copy  default new
	frame new {
		drop if month==0
		keep year month cpsid hour_wage_1 hour_wage_2 hours_work_1 hours_work_2
		duplicates drop
		}
     
	** Now work only on March CPS. With this, we may also do the hourly earnings imputation

	keep if month==0
	drop hour_wage_1 hour_wage_2 hours_work_1 hours_work_2
	frlink  m:1 year cpsid , frame(new)
	frget hour_wage_1 hour_wage_2 hours_work_1 hours_work_2 , from(new)
	
	** Keep only 1 family per Households 
	*replace flag=10 if (incbus_1<0  | incbus_2<0 ) & flag==0
	*replace flag=10 if (incfarm_1<0 | incfarm_2<0) & flag==0
	replace flag=10 if (inctot_1<0  | inctot_2<0)  & flag==0
	replace flag=10 if (other_total_income<0 | nly<0) & flag==0
	replace flag=10 if (gq==2) & flag==0
	
	label define flag 10 "Unusual income. Negative total income, negative other income, negative NLY", modify
	replace flag=11 if log(nly+1)>13 & flag==0
	label define flag 11 "Unusually high NLY", modify
	replace flag=12 if flag==0 & ( incfarm_1<0 |  incfarm_2 < 0 | incbus_2<0 | incbus_1 <0)
	label define flag 12 "Negative business or farm income", modify
	tab flag if pid_sort==1
	by year month serial:gen popstat_1=popstat[1]   
	by year month serial:gen popstat_2=popstat[2]    if flag_couple==1
	replace flag = 13 if flag==0 & ( popstat_1==2 |  popstat_2==2)
 	label define flag 13 "Both Army  ", modify	
	 	tab flag if pid_sort==1


	/*

                                   flag |      Freq.     Percent        Cum.
----------------------------------------+-----------------------------------
                              no errors |    105,562       73.89       73.89
                Hhincome != sum(inctot) |      1,637        1.15       75.04
                  Negative total income |      1,995        1.40       76.43
              Households without a head |          4        0.00       76.44
                 Non nuclear Households |     13,224        9.26       85.69
                 HH with older children |     12,175        8.52       94.21
    Household with High income children |      5,568        3.90       98.11
           HH with negative Rent income |      1,036        0.73       98.84
Unusual income. Negative total income,  |        279        0.20       99.03
                     Unusually high NLY |          3        0.00       99.03
       Negative business or farm income |        507        0.35       99.39
                            Both Army   |        873        0.61      100.00
----------------------------------------+-----------------------------------
                                  Total |    142,863      100.00

	*/
	
	 
	********************************************************



	** Alternative measure of Wage per hour
	gen hour_wage_alt=(incwage+incbus+incfarm)/(wkswork1*uhrsworkly)
	
	by year month serial:replace hour_wage_1=hour_wage_alt[1]  if hour_wage_1==.
	by year month serial:replace hour_wage_2=hour_wage_alt[2]  if hour_wage_2==.
	replace hour_wage_2 =. if flag_couple==0
	
	by year month serial:replace hours_work_1=uhrsworkly[1]  if hours_work_1==.
	by year month serial:replace hours_work_2=uhrsworkly[2]  if hours_work_2==.
	replace hours_work_2=. if flag_couple==0
	
	by year month serial:gen diffany_1=diffany[1]
	by year month serial:gen diffany_2=diffany[2]
	replace diffany_2=. if flag_couple==0

	** Last Consistency checks
	replace hour_wage_1=. if hour_wage_1<=0
	replace hour_wage_2=. if hour_wage_2<=0
	** Upper Boounds
	replace hour_wage_1=300 if hour_wage_1>300 & hour_wage_1!=.
	replace hour_wage_2=300 if hour_wage_2>300 & hour_wage_2!=.
	** lower Boounds 1
	replace hour_wage_1=1 if hour_wage_1<1 & hour_wage_1!=.
	replace hour_wage_2=1 if hour_wage_2<1 & hour_wage_2!=.
	** There are SOME cases where Hours are not matched with Actual wages. They will be imputed
	ren hour_wage_1 wage_1
	ren hour_wage_2 wage_2
	ren hours_work_1 hrs_1
	ren hours_work_2 hrs_2
	

	
	compress 
	
 
	sum wage_1 incwage_1 incfarm_1 incbus_1 hrs_1   if incwage_1>0
	sum wage_2 incwage_2 incfarm_2 incbus_2 hrs_2   if incwage_2>0
 

	global tokeep year serial month cpsid cpsidp same_sex wkswork1 ///
				  hhtenure diffany* division region statefip metro metarea cpi99 cbsasz ///
				  hhincome vetstat relate sex asecwt  famsize  hrs_1 hrs_2 wage_1 wage_2 marst ///
				  age nchild  bpl yrimmig citizen fbpl mbpl nativity empstat ///
				  labforce  occ2010 ind1990  classwkr schlcoll educ popstat earnwt ///
				  inc*_?  taxinc migsta1 union nkids05 nkids613 nkids1418 /// 
				  nkids1417 nkids19p age_1 age_2 educ_1 educ_2 race_1 race_2   ///
				  region classwkr classwly metfips ownershp pubhous heatsub foodstmp lunchsub ///
				  nly wage_1 wage_2 hrs_1 hrs_2 mis   paidhour fullpart  adjginc sort_id flag
				  
	keep  $tokeep     
	label var nly "Non labor income"
 
	gen nord=_n
		by year month serial :gen pid_sort=_n

	*** Now Merge it with Marginal Tax Rates
 
	merge m:1 year month serial using "data\Taxsim_output.dta"
	
	drop if relate>=301
	tab _m
	sort nord
	gsort year month serial sex -age sort_id
  
	** Additional clean up. Drop if either Husband or wife is in the army
	
	by year month serial :replace pid_sort=_n
	tab flag   if pid_sort==1
	
	keep if flag==0 & _m==3
	
 	drop _m
	*** Drop tax info not used yet
	drop t_v10-t_v42
	drop f_v10-f_v42
	** Sorting data for "faster assesment 
 
	
	by year month serial:gen couple_flag=_N
 
 
	foreach i in age sex marst popstat vetstat bpl yrimmig citizen mbpl wkswork1 ///
				 fbpl nativity empstat labforce occ2010 ind1990 classwkr educ  /// 
				 schlcoll diffany earnwt classwly fullpart  ///
				 migsta1 paidhour union  {
			capture confirm new var `i'_1
			if _rc==0 {
				local lv:variable label `i'
				local vlv:value label `i'
				by year month serial:gen `i'_1=`i'[1]  
				by year month serial:gen `i'_2=`i'[2] 
				replace `i'_2=. if  couple_flag==1
				label var `i'_1 "`lv'"
				label var `i'_2 "`lv'"
				label values `i'_1 `i'_2 `vlv'
			}
			drop `i'
		}
	by year month serial:replace asecwt=asecwt[1]  
	compress
	by year month serial:gen e_flag=_n
	keep if e_flag==1
	note: This dataset has ALL tax rates
	save ".\data\CPS2015-2017_with_tax_sim_2.dta", replace
  
 
 ** Hour wage
 